#loading packages
library(ezids)
#library(ggplot2)
library(ggrepel)
library(gridExtra)
#library(tibble)
library(tidyverse)
library(corrplot)
library(lattice)
library(psych)
library(FNN)
library(gmodels)
library(caret)
# Load the daily NYC weather observations (1869-2022).
NYweath <- read.csv("data/NYC_weather_1869_2022.csv")

# Parse the date and derive day/month/year components; coerce the
# temperature columns to numeric. `month` becomes a factor so it can be
# used as a categorical regressor later; `year` stays numeric so it can
# be filtered and used as a trend variable.
NYweath <- NYweath %>%
  mutate(DATE  = as.Date(DATE),
         day   = format(DATE, format = "%d"),
         month = as.factor(format(DATE, format = "%m")),
         year  = as.numeric(format(DATE, format = "%Y")),
         TMAX  = as.numeric(TMAX),
         TMIN  = as.numeric(TMIN),
         TAVG  = as.numeric(TAVG))

# Keep only the variables of interest ...
NYweath_sub <- NYweath %>%
  select(DATE, day, month, year, TMAX, TMIN, TAVG, PRCP, SNOW)

# ... and restrict to observations from 1900 onward.
NYweath_00 <- NYweath_sub %>% filter(year > 1899)
xkabledplyhead(NYweath_00)
| DATE | day | month | year | TMAX | TMIN | TAVG | PRCP | SNOW | |
|---|---|---|---|---|---|---|---|---|---|
| 11265 | 1900-01-01 | 01 | 01 | 1900 | 20 | 14 | NA | 0.03 | 0.5 |
| 11266 | 1900-01-02 | 02 | 01 | 1900 | 23 | 13 | NA | 0.00 | 0.0 |
| 11267 | 1900-01-03 | 03 | 01 | 1900 | 26 | 19 | NA | 0.00 | 0.0 |
| 11268 | 1900-01-04 | 04 | 01 | 1900 | 32 | 19 | NA | 0.00 | 0.0 |
| 11269 | 1900-01-05 | 05 | 01 | 1900 | 39 | 29 | NA | 0.00 | 0.0 |
First, we format our data to create a variable representing whether it rained on a given day:
# Flag each day as rainy (1) or dry (0) from the recorded precipitation,
# stored as a factor so it can serve as a classification response.
# Days with missing PRCP propagate to NA in `rained`.
NYweath.rain <- NYweath_00
NYweath.rain$rained <- as.factor(ifelse(NYweath.rain$PRCP > 0, 1, 0))
xkablesummary(NYweath.rain)
| DATE | day | month | year | TMAX | TMIN | TAVG | PRCP | SNOW | rained | |
|---|---|---|---|---|---|---|---|---|---|---|
| Min | Min. :1900-01-01 | Length:44829 | 01 : 3813 | Min. :1900 | Min. : 2.0 | Min. :-15.0 | Min. : 0 | Min. :0.00 | Min. : 0.0 | 0:29965 |
| Q1 | 1st Qu.:1930-09-08 | Class :character | 03 : 3813 | 1st Qu.:1930 | 1st Qu.: 47.0 | 1st Qu.: 34.0 | 1st Qu.:43 | 1st Qu.:0.00 | 1st Qu.: 0.0 | 1:14864 |
| Median | Median :1961-05-15 | Mode :character | 05 : 3813 | Median :1961 | Median : 63.0 | Median : 48.0 | Median :57 | Median :0.00 | Median : 0.0 | NA |
| Mean | Mean :1961-05-15 | NA | 07 : 3813 | Mean :1961 | Mean : 62.1 | Mean : 47.2 | Mean :56 | Mean :0.13 | Mean : 0.1 | NA |
| Q3 | 3rd Qu.:1992-01-20 | NA | 08 : 3813 | 3rd Qu.:1992 | 3rd Qu.: 78.0 | 3rd Qu.: 62.0 | 3rd Qu.:71 | 3rd Qu.:0.05 | 3rd Qu.: 0.0 | NA |
| Max | Max. :2022-09-26 | NA | 10 : 3782 | Max. :2022 | Max. :106.0 | Max. : 87.0 | Max. :92 | Max. :7.57 | Max. :27.3 | NA |
| NA | NA | NA | (Other):21982 | NA | NA | NA | NA’s :42181 | NA | NA’s :96 | NA |
xkabledplyhead(NYweath.rain)
| DATE | day | month | year | TMAX | TMIN | TAVG | PRCP | SNOW | rained | |
|---|---|---|---|---|---|---|---|---|---|---|
| 11265 | 1900-01-01 | 01 | 01 | 1900 | 20 | 14 | NA | 0.03 | 0.5 | 1 |
| 11266 | 1900-01-02 | 02 | 01 | 1900 | 23 | 13 | NA | 0.00 | 0.0 | 0 |
| 11267 | 1900-01-03 | 03 | 01 | 1900 | 26 | 19 | NA | 0.00 | 0.0 | 0 |
| 11268 | 1900-01-04 | 04 | 01 | 1900 | 32 | 19 | NA | 0.00 | 0.0 | 0 |
| 11269 | 1900-01-05 | 05 | 01 | 1900 | 39 | 29 | NA | 0.00 | 0.0 | 0 |
First, let’s create a two-way contingency table to study how the frequency of rainy days varies with the factor month, and to make sure there are no cells with zero frequencies.
# Two-way contingency table of rain outcome by month, used to verify
# there are no zero-frequency cells before running the chi-squared test.
rainy_vs_month <- xtabs(~ rained + month, data = NYweath.rain)
rainy_vs_month
## month
## rained 01 02 03 04 05 06 07 08 09 10 11 12
## 0 2476 2297 2452 2324 2428 2393 2545 2597 2653 2770 2551 2479
## 1 1337 1177 1361 1366 1385 1297 1268 1216 1033 1012 1109 1303
We can then quickly run a chi-squared test to see if the two are independent (or same frequency distribution).
# Pearson chi-squared test of independence between rain outcome and month.
chisq.result <- chisq.test(rainy_vs_month)
chisq.result
##
## Pearson's Chi-squared test
##
## data: rainy_vs_month
## X-squared = 200, df = 11, p-value <2e-16
Based on the result, the factor variables rained and
month are dependent - there is a statistically significant
relationship between the two. We can use the month variable
in our logistic regression to predict the rained outcome
for a day.
Let’s jump to the logistic regression to predict a precipitation event on a given day:
# Logistic regression: probability of a precipitation event as a function
# of month (categorical, January as the baseline level) and the daily
# maximum and minimum temperatures.
rainedLogit <- glm(rained ~ month + TMAX + TMIN,
                   data = NYweath.rain,
                   family = binomial(link = "logit"))
summary(rainedLogit)
##
## Call:
## glm(formula = rained ~ month + TMAX + TMIN, family = "binomial",
## data = NYweath.rain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.681 -0.901 -0.696 1.179 2.919
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.28924 0.05857 -4.94 7.9e-07 ***
## month02 0.03407 0.05171 0.66 0.51
## month03 0.02084 0.05141 0.41 0.69
## month04 -0.06767 0.05626 -1.20 0.23
## month05 -0.34685 0.06373 -5.44 5.3e-08 ***
## month06 -0.77503 0.07242 -10.70 < 2e-16 ***
## month07 -1.10573 0.07803 -14.17 < 2e-16 ***
## month08 -1.18707 0.07681 -15.45 < 2e-16 ***
## month09 -1.18831 0.07163 -16.59 < 2e-16 ***
## month10 -0.97082 0.06243 -15.55 < 2e-16 ***
## month11 -0.66497 0.05509 -12.07 < 2e-16 ***
## month12 -0.25122 0.05069 -4.96 7.2e-07 ***
## TMAX -0.10397 0.00210 -49.49 < 2e-16 ***
## TMIN 0.13821 0.00251 55.00 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 56958 on 44828 degrees of freedom
## Residual deviance: 53249 on 44815 degrees of freedom
## AIC: 53277
##
## Number of Fisher Scoring iterations: 4
xkabledply(rainedLogit, title = paste("Logistic Regression :", format(formula(rainedLogit)) ))
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| (Intercept) | -0.2892 | 0.0586 | -4.938 | 0.000 |
| month02 | 0.0341 | 0.0517 | 0.659 | 0.510 |
| month03 | 0.0208 | 0.0514 | 0.405 | 0.685 |
| month04 | -0.0677 | 0.0563 | -1.203 | 0.229 |
| month05 | -0.3469 | 0.0637 | -5.442 | 0.000 |
| month06 | -0.7750 | 0.0724 | -10.702 | 0.000 |
| month07 | -1.1057 | 0.0780 | -14.170 | 0.000 |
| month08 | -1.1871 | 0.0768 | -15.454 | 0.000 |
| month09 | -1.1883 | 0.0716 | -16.590 | 0.000 |
| month10 | -0.9708 | 0.0624 | -15.550 | 0.000 |
| month11 | -0.6650 | 0.0551 | -12.071 | 0.000 |
| month12 | -0.2512 | 0.0507 | -4.956 | 0.000 |
| TMAX | -0.1040 | 0.0021 | -49.485 | 0.000 |
| TMIN | 0.1382 | 0.0025 | 55.005 | 0.000 |
All the coefficients are found significant (small p-values) except for the months of February, March, and April. TMAX has a negative effect on precipitation while TMIN has a positive effect. The months with significant coefficients all have a negative effect on the rain outcome for a given day.
Let’s obtain the growth/decay factors for each explanatory variables. The factors are the exponentials of the coefficients:
# Odds-ratio (growth/decay) factors: exponentiate the logit coefficients.
expcoeff <- exp(coef(rainedLogit))
xkabledply(as.table(expcoeff), title = "Exponential of coefficients in rained Logit Reg")
| x | |
|---|---|
| (Intercept) | 0.749 |
| month02 | 1.035 |
| month03 | 1.021 |
| month04 | 0.935 |
| month05 | 0.707 |
| month06 | 0.461 |
| month07 | 0.331 |
| month08 | 0.305 |
| month09 | 0.305 |
| month10 | 0.379 |
| month11 | 0.514 |
| month12 | 0.778 |
| TMAX | 0.901 |
| TMIN | 1.148 |
Compared to January, the months of February and March have a positive effect on the probability of a rainy day. Each gain in the minimum daily temperature value also has a positive effect on chances of rain.
library(ModelMetrics)
library(scales)
# Confusion matrix at the default 0.5 probability cutoff.
# In the rendered table the column totals equal the actual class counts
# (29965 dry / 14864 rainy), so rows are predicted class and columns are
# actual class; stored column-major: [1]=TN, [2]=FP, [3]=FN, [4]=TP.
rainedLogit.confusion <- ModelMetrics::confusionMatrix(
  actual = rainedLogit$y,
  predicted = rainedLogit$fitted.values)
xkabledply(rainedLogit.confusion,
  title = "Confusion matrix for logit model")
| 1 | 2 |
|---|---|
| 28093 | 11255 |
| 1872 | 3609 |
# Overall accuracy: (TP + TN) / all observations.
accuracy <- (rainedLogit.confusion[4] + rainedLogit.confusion[1]) / nrow(NYweath.rain)
# Precision: TP / (TP + FP). The 2x2 matrix is indexed column-major with
# rows = predicted and columns = actual (per the marginal totals in the
# rendered table), so FP is element [2]. The previous formula divided by
# element [3] (FN), which computes recall/sensitivity, not precision.
precision <- rainedLogit.confusion[4] / (rainedLogit.confusion[4] + rainedLogit.confusion[2])
The confusion matrix above was generated for the cutoff value of
0.5.
The accuracy of this logistic model is approx. 0.71 which shows that out of all predictions made by our model on whether it rained or not, 71% were correct.
The value reported as precision (24%) is computed as TP/(TP + FN), which is actually the model’s recall (sensitivity): of all days on which it actually rained, the model predicted rain only 24% of the time. The true precision — the share of predicted rainy days on which it actually rained — is about 66% (3609 of 5481).
# Null (intercept-only) model for McFadden's pseudo R-squared,
# defined as 1 - logLik(full) / logLik(null).
rainedNullLogit <- glm(rained ~ 1, data = NYweath.rain, family = "binomial")
mcFadden <- 1 - logLik(rainedLogit) / logLik(rainedNullLogit)
mcFadden
## 'log Lik.' 0.0651 (df=14)
The McFadden test outputs a pseudo-R-squared value of 0.065, meaning our regressors explain only about 6.5% of the variation in rainy-day outcomes — the model has weak explanatory power for predicting rainy days.
The Hosmer and Lemeshow Goodness of Fit test can be used to evaluate logistic regression fit.
library(ResourceSelection)
# Hosmer-Lemeshow goodness-of-fit test (a chi-squared test).
# hoslem.test() expects a numeric 0/1 response. `rained` is a factor, so
# passing it directly feeds the test the integer codes 1/2 and degenerates
# the statistic (the previously reported X-squared equaled the sample
# size, 44829). Convert the factor back to numeric 0/1 first.
rainedLogitHoslem <- hoslem.test(as.numeric(as.character(NYweath.rain$rained)),
                                 fitted(rainedLogit))
rainedLogitHoslem
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: NYweath.rain$rained, fitted(rainedLogit)
## X-squared = 44829, df = 8, p-value <2e-16
The p-value is essentially zero, which for the Hosmer–Lemeshow test indicates a poor fit: the test’s null hypothesis is that the model fits the data well, so a small p-value rejects good fit. (The X-squared statistic equal to the sample size also suggests the factor response was passed to the test without first being converted to numeric 0/1.)
library(pROC)
# Predicted rain probabilities from the fitted logit model.
probabilities <- predict(rainedLogit, type = "response")
# Attach the probabilities as a column so roc() can use the formula interface.
NYweath.rain$probs <- probabilities
rainedLogitROC <- roc(rained ~ probs, data = NYweath.rain)
plot(rainedLogitROC)
The area under the ROC curve is 0.68, below the 0.8 threshold commonly used to indicate a good classifier, so the model’s ability to discriminate rainy from dry days is only modest.
Overall, I think we should attempt to find a different combination of predictors for this model.
Let’s pull in our air quality data. It contains measurements of daily PM2.5 and air quality index values taken from various locations around Manhattan.
PM2.5 includes particles less than or equal to 2.5 micrometers and is also called fine particle pollution. The AQI is an index for reporting daily air quality. It tells how clean or polluted the air is, and what associated health effects might be a concern, especially for ground-level ozone and particle pollution.
Let’s load the new data and have a look at its structure:
# Load the Manhattan air-quality observations and keep the date, daily
# PM2.5 concentration, and daily AQI columns under shorter names.
# (read.csv already returns a data.frame, so no extra wrapper is needed.)
DailyAQ_00_22 <- read.csv("data/daily-AQ-NY-00-20.csv")
DailyAQ_00_22 <- DailyAQ_00_22[c('Date', 'Daily.Mean.PM2.5.Concentration', 'DAILY_AQI_VALUE')]
colnames(DailyAQ_00_22) <- c('DATE', 'PM2.5', 'AQI')
str(DailyAQ_00_22)
## 'data.frame': 7097 obs. of 3 variables:
## $ DATE : chr "1/1/00" "1/2/00" "1/3/00" "1/4/00" ...
## $ PM2.5: num 39.8 41.3 30.8 16.4 7.8 23.1 18.2 27 19.4 9.7 ...
## $ AQI : int 112 115 90 60 33 74 64 82 66 40 ...
xkablesummary(DailyAQ_00_22)
| DATE | PM2.5 | AQI | |
|---|---|---|---|
| Min | Length:7097 | Min. :-1.2 | Min. : 0 |
| Q1 | Class :character | 1st Qu.: 5.9 | 1st Qu.: 25 |
| Median | Mode :character | Median : 9.0 | Median : 38 |
| Mean | NA | Mean :10.9 | Mean : 41 |
| Q3 | NA | 3rd Qu.:13.6 | 3rd Qu.: 54 |
| Max | NA | Max. :86.0 | Max. :167 |
xkabledplyhead(DailyAQ_00_22)
| DATE | PM2.5 | AQI |
|---|---|---|
| 1/1/00 | 39.8 | 112 |
| 1/2/00 | 41.3 | 115 |
| 1/3/00 | 30.8 | 90 |
| 1/4/00 | 16.4 | 60 |
| 1/5/00 | 7.8 | 33 |
We need to convert the date from a character string to an R type. We also calculate year-over-year growth rates for both daily PM2.5 and AQI and store them in a column.
We have about 7,000 observations between the years 2000 and 2022. A few plots to help us visualize the data:
# Distribution plots of daily PM2.5 and daily AQI, each annotated with
# its sample mean.
mean_pm25 <- mean(DailyAQ_00_22$PM2.5)
mean_aqi <- mean(DailyAQ_00_22$AQI)
# TODO: combine plots into one frame
# NOTE(review): geom_histogram() ignores `bins` when `binwidth` is also
# supplied, so only binwidth takes effect in the two histograms below.
ggplot(DailyAQ_00_22) +
geom_histogram(aes(x=PM2.5), na.rm=TRUE, alpha=0.5, color="black", fill='#BD2AE2', bins=100, binwidth=2) +
geom_vline(xintercept=mean_pm25, color="black", size=1, linetype=5, show.legend=FALSE) +
annotate("text", x=mean_pm25 + 9, y=1000, label=paste(round(mean_pm25, 2)), angle=0, size=4, color="black") +
labs(title="Distribution of Daily PM2.5 Measurements", x="ug/m3 LC", y="Count")
ggplot(DailyAQ_00_22) +
geom_histogram(aes(x=AQI), na.rm=TRUE, alpha=0.5, color="black", fill='#2DD164', bins=50, binwidth=5) +
geom_vline(xintercept=mean_aqi, color="black", size=1, linetype=5, show.legend=FALSE) +
annotate("text", x=mean_aqi + 9, y=625, label=paste(round(mean_aqi, 2)), angle=0, size=4, color="black") +
labs(title="Distribution of Daily AQI Level", x="", y="Count")
# TODO: group these in same figure, separate plots
# NOTE(review): `DailyAQ_00_22_Yearly_Growth` is not defined anywhere in
# the visible code -- presumably it is built in a hidden chunk that also
# converts DATE and computes the year-over-year rates; confirm.
ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
geom_line(aes(x = year, y = pm2.5_diffRate), na.rm = T, stat = "identity", color="#290DDA", size=1) +
geom_point(aes(x = year, y = pm2.5_diffRate), na.rm = TRUE, fill="#124CF2", shape = 23) +
labs(title="PM2.5 particulate year-over-year rate in NYC", x="Year", y="ug/m3 LC") +
theme(
axis.title.y = element_text(color = "#043008", size = 13),
axis.title.y.right = element_text(color = "#E6E930", size = 13)
)
ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
geom_line(aes(x = year, y = aqi_diffRate), na.rm = T, stat="identity", color="#043008", size=1) +
geom_point(aes(x = year, y = aqi_diffRate), na.rm = TRUE, fill="#E6E930", shape = 23) +
labs(title="AQI year-over-year rate in NYC", x="Year", y="ug/m3 LC") +
theme(
axis.title.y = element_text(color = "#043008", size = 13),
axis.title.y.right = element_text(color = "#E6E930", size = 13)
)
# Earlier attempt at a dual-axis version, kept for reference:
# ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
# labs(title="AQI and growth rate and growht/decay rate by year in NYC", x="Year", y="ug/m3 LC") +
# scale_y_continuous(sec.axis = sec_axis(~., name="Year-over-year Diff (%)")) +
# theme(
# axis.title.y = element_text(color = "#2DD164", size = 13),
# axis.title.y.right = element_text(color = "#E6E930", size = 13)
# )
Next, we combine our new dataset with the NYC weather data based on the date. The days without a matching air quality measurement will be dropped after merge.
# merge data frame by date (inner join: days without a matching air
# quality measurement are dropped)
DailyAQ_merged <- merge(DailyAQ_00_22, NYweath_00, by="DATE")
# select required columns; `year.x` is the merge-suffixed year column.
# NOTE(review): the .x suffix implies both tables carry a `year` column,
# but DailyAQ_00_22 has no year in the visible code -- presumably added in
# a hidden chunk along with the DATE conversion; confirm.
DailyAQ_merged <- DailyAQ_merged[ , c('DATE', 'year.x', 'month', 'PM2.5', 'AQI', 'TMAX', 'TMIN', 'PRCP', 'SNOW')]
# restore the plain name `year`
colnames(DailyAQ_merged)[2] <- "year"
str(DailyAQ_merged)
## 'data.frame': 7063 obs. of 9 variables:
## $ DATE : Date, format: "2000-01-01" "2000-01-02" ...
## $ year : num 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ month: Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ PM2.5: num 39.8 41.3 30.8 16.4 7.8 23.1 18.2 27 19.4 9.7 ...
## $ AQI : int 112 115 90 60 33 74 64 82 66 40 ...
## $ TMAX : num 50 60 64 60 47 49 38 51 58 52 ...
## $ TMIN : num 34 43 51 46 29 35 30 37 44 40 ...
## $ PRCP : num 0 0 0 0.7 0 0 0 0.02 0.84 0 ...
## $ SNOW : num 0 0 0 0 0 0 0 0 0 0 ...
xkablesummary(DailyAQ_merged)
| DATE | year | month | PM2.5 | AQI | TMAX | TMIN | PRCP | SNOW | |
|---|---|---|---|---|---|---|---|---|---|
| Min | Min. :2000-01-01 | Min. :2000 | 03 : 634 | Min. :-1.2 | Min. : 0.0 | Min. : 13.0 | Min. :-1.0 | Min. :0.00 | Min. : 0.00 |
| Q1 | 1st Qu.:2007-02-06 | 1st Qu.:2007 | 01 : 617 | 1st Qu.: 5.9 | 1st Qu.: 25.0 | 1st Qu.: 48.0 | 1st Qu.:36.0 | 1st Qu.:0.00 | 1st Qu.: 0.00 |
| Median | Median :2012-03-16 | Median :2012 | 05 : 615 | Median : 9.1 | Median : 38.0 | Median : 64.0 | Median :48.0 | Median :0.00 | Median : 0.00 |
| Mean | Mean :2011-12-20 | Mean :2011 | 04 : 611 | Mean :10.9 | Mean : 41.1 | Mean : 62.6 | Mean :48.4 | Mean :0.13 | Mean : 0.09 |
| Q3 | 3rd Qu.:2017-05-27 | 3rd Qu.:2017 | 12 : 610 | 3rd Qu.:13.6 | 3rd Qu.: 54.0 | 3rd Qu.: 79.0 | 3rd Qu.:63.0 | 3rd Qu.:0.05 | 3rd Qu.: 0.00 |
| Max | Max. :2022-09-26 | Max. :2022 | 07 : 579 | Max. :86.0 | Max. :167.0 | Max. :103.0 | Max. :83.0 | Max. :7.57 | Max. :27.30 |
| NA | NA | NA | (Other):3397 | NA | NA | NA | NA | NA | NA |
# Restrict to the numeric columns used for correlation and modeling.
num_cols <- c('PM2.5', 'AQI', 'TMAX', 'TMIN', 'PRCP', 'SNOW', 'year')
DailyAQ_numerics <- DailyAQ_merged[, num_cols]
# Earlier idea (kept for reference): combine PRCP and SNOW into one value.
#DailyAQ_numerics$PRCP <- DailyAQ_numerics$PRCP + DailyAQ_numerics$SNOW
#DailyAQ_numerics <- subset(DailyAQ_numerics, select = -c(SNOW))
# Re-index year so that 2000 maps to 0.
DailyAQ_numerics$year <- DailyAQ_numerics$year - 2000
Lattice pairs plot
# Base-graphics pairs plot: quick look at all pairwise scatter plots.
pairs(DailyAQ_numerics)
# psych::pairs.panels adds histograms, Pearson correlations, and fitted
# lines/ellipses to the same pairwise view.
pairs.panels(DailyAQ_numerics,
method = "pearson", # correlation method
hist.col = "red", # set histogram color
density = TRUE, # show density plots
ellipses = TRUE, # show correlation ellipses
smoother = TRUE,
lm = TRUE,
main = "Pairs Plot Of Weather and Air Quality Numerical Variables",
cex.labels=0.75
)
Another way to look at correlation using the
corrplot
function:
# Pearson correlation matrix of the numeric variables, rendered with the
# coefficient printed in each cell.
DailyAQ_cor <- cor(DailyAQ_numerics)
corrplot(DailyAQ_cor,
         method = "number",
         title = "Correlation Plot Of Weather and Air Quality Numerical Variables",
         mar = c(2, 2, 2, 2))
From the pearson correlation plot above, we can see a significantly large, positive correlation between PM2.5 concentrations and the daily AQI values. This is expected as PM2.5 are heavily weighed in calculations of AQI. Unfortunately, the correlation significance among our weather and air quality variables is relatively weak. However, we will still attempt a linear model between them below.
# Yearly averages of daily AQI and PM2.5.
# NOTE(review): `DailyAQ_00_22_Yearly_Growth` is built in a hidden chunk;
# confirm its column definitions.
ggplot(DailyAQ_00_22_Yearly_Growth) +
geom_line(aes(x = year, y = aqi_avg), stat="identity", color="#2DD164", size=1) +
geom_point(aes(x = year, y = aqi_avg), na.rm = TRUE, fill="#457108", shape = 21) +
labs(title="Average AQI by year in NYC", x="Year", y="AQI value")
# NOTE(review): this plot shows pm2.5_avg but the y-axis label reads
# "Year-over-year Diff (%)" -- the label looks copied from the rate plot.
ggplot(DailyAQ_00_22_Yearly_Growth) +
geom_line(aes(x = year, y = pm2.5_avg), stat="identity", color="#BD2AE2", size=1) +
geom_point(aes(x = year, y = pm2.5_avg), na.rm = TRUE, fill="#124CF2", shape = 21) +
labs(title="Average PM2.5 particulate amount by year in NYC", x="Year", y="Year-over-year Diff (%)")
Let’s start by creating a linear model to describe the relationship between AQI and year.
# First linear model: daily AQI as a function of year (re-indexed so that
# 2000 = 0 by the earlier subtraction).
aqi_fit <- lm(AQI ~ year, data = DailyAQ_numerics)
summary(aqi_fit)
##
## Call:
## lm(formula = AQI ~ year, data = DailyAQ_numerics)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.34 -13.11 -1.49 11.06 126.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.4369 0.4497 136.6 <2e-16 ***
## year -1.7748 0.0342 -51.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.4 on 7061 degrees of freedom
## Multiple R-squared: 0.276, Adjusted R-squared: 0.276
## F-statistic: 2.69e+03 on 1 and 7061 DF, p-value: <2e-16
xkabledply(aqi_fit, title = paste("First Linear Model: ", format( formula(aqi_fit) )))
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 61.44 | 0.4497 | 136.6 | 0 |
| year | -1.77 | 0.0342 | -51.8 | 0 |
The coefficient for the year regressor is significant and negative: each additional year is associated with a decrease of about 1.77 in daily AQI. Although the p-value of the F-statistic is significant, year still only explains about 28% of the variability in daily AQI measurements.
# Scatter of all daily AQI values by year with a fitted linear trend line.
# NOTE(review): `year` is taken from DailyAQ_00_22, which in the visible
# code has only DATE/PM2.5/AQI -- the year column is presumably added in
# a hidden chunk; confirm.
ggplot(DailyAQ_00_22, aes(x = year, y = AQI)) +
geom_point(alpha = 0.5, color = "#2DD164", position = "jitter") +
labs(x = "Year", y = "AQI Value", title = "Daily AQI Values From 2000-2022 With Trend Line") +
geom_smooth(method = 'lm', formula = 'y ~ x', color = "black", fill="black")
The plot displays a slightly downward trend in daily AQI, but there is a lot of noise distorting the fit.
Month as a categorical regressor: in our first analysis, we analyzed linear trends of TMAX over time and found a slight positive trend over the years 1900–2022. Based on that fit, we hypothesized that seasonality had an impact on model performance.
We believe seasonality also affects daily AQI measurements.
# Monthly climatology for NYC: average TMAX and TMIN by calendar month.
NYweath_Monthly_Avg <- NYweath_00 %>%
  group_by(month) %>%
  summarize(avg_max_temp = mean(TMAX, na.rm = TRUE),
            avg_min_temp = mean(TMIN, na.rm = TRUE))
# Line plot of average TMAX by calendar month (month coerced to numeric
# 1-12 so the x-axis is continuous).
ggplot(NYweath_Monthly_Avg, aes(x = as.numeric(month), y = avg_max_temp)) +
geom_line(color="#F21E1E", size = 2) +
geom_point(na.rm = TRUE, fill="#126BF4", shape = 21, size = 4) +
labs(title="Average TMAX By Month in NYC", x="Month", y="Temperature (°F)") +
scale_x_continuous(name = "Month",
breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))
# Average daily PM2.5 and AQI by calendar month.
DailyAQ_monthly <- DailyAQ_merged %>%
group_by(month) %>%
summarize(pm2.5_avg = mean(PM2.5, na.rm=T),
aqi_avg = mean(AQI, na.rm=T))
# calculate growth/decay rates month-over-month
# NOTE(review): the denominator is the CURRENT month's average rather than
# the previous month's (lag), which is the conventional growth-rate
# denominator -- confirm this is intentional before reusing the rates.
DailyAQ_monthly <- DailyAQ_monthly %>%
mutate(pm2.5_diffRate = ((pm2.5_avg - lag(pm2.5_avg)) / pm2.5_avg) * 100,
aqi_diffRate = ((aqi_avg - lag(aqi_avg)) / aqi_avg) * 100
)
# populate January rates based on December (wrap-around, same convention)
# NOTE(review): `DailyAQ_monthly[1, 4]$pm2.5_diffRate <- ...` works but the
# simpler `DailyAQ_monthly$pm2.5_diffRate[1] <- ...` would be clearer.
DailyAQ_monthly[1, 4]$pm2.5_diffRate <- ((DailyAQ_monthly$pm2.5_avg[1] - DailyAQ_monthly$pm2.5_avg[12]) / DailyAQ_monthly$pm2.5_avg[1]) * 100
DailyAQ_monthly[1, 5]$aqi_diffRate <- ((DailyAQ_monthly$aqi_avg[1] - DailyAQ_monthly$aqi_avg[12]) / DailyAQ_monthly$aqi_avg[1]) * 100
# Monthly average AQI and month-over-month change rate.
# TODO: combine with month-over-month change plot
ggplot(DailyAQ_monthly, aes(x = as.numeric(month), y = aqi_avg)) +
geom_line(color="#47ABE9", size = 2) +
geom_point(na.rm = TRUE, fill="#C10808", shape = 21, size = 4) +
labs(title="Average AQI By Month in NYC", x="Month", y="AQI") +
scale_x_continuous(name = "Month",
breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))
ggplot(DailyAQ_monthly, aes(x = as.numeric(month), y = aqi_diffRate)) +
geom_line(na.rm = TRUE, stat="identity", color="#043008", size=2) +
geom_point(na.rm = TRUE, fill="#E6E930", shape = 21) +
labs(title="Average AQI month-over-month change rate", x="Month", y="AQI") +
scale_x_continuous(name = "Month",
breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))
Let’s modify our last model to attempt to fit seasonality by adding
month as a categorical regressor and our
variable-of-interest from the last project - TMAX - to
predict daily AQI.
# Second linear model: daily AQI on TMAX plus month as a categorical
# regressor (January is the baseline level), to absorb seasonality.
aqi_fit2 <- lm(AQI ~ TMAX + month, data = DailyAQ_merged)
summary(aqi_fit2)
##
## Call:
## lm(formula = AQI ~ TMAX + month, data = DailyAQ_merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.72 -14.66 -3.03 11.52 124.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.9722 1.3444 18.58 < 2e-16 ***
## TMAX 0.6781 0.0271 25.03 < 2e-16 ***
## month02 -5.9768 1.1604 -5.15 2.7e-07 ***
## month03 -19.3975 1.1672 -16.62 < 2e-16 ***
## month04 -33.0288 1.2871 -25.66 < 2e-16 ***
## month05 -37.9619 1.4304 -26.54 < 2e-16 ***
## month06 -38.8086 1.5925 -24.37 < 2e-16 ***
## month07 -37.7940 1.6857 -22.42 < 2e-16 ***
## month08 -40.4565 1.6563 -24.43 < 2e-16 ***
## month09 -43.7962 1.5422 -28.40 < 2e-16 ***
## month10 -34.8408 1.3458 -25.89 < 2e-16 ***
## month11 -19.9973 1.2209 -16.38 < 2e-16 ***
## month12 -7.9325 1.1470 -6.92 5.1e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20 on 7050 degrees of freedom
## Multiple R-squared: 0.149, Adjusted R-squared: 0.147
## F-statistic: 103 on 12 and 7050 DF, p-value: <2e-16
xkabledply(aqi_fit2, title = paste("Second Linear Model: ", format( formula(aqi_fit2) )))
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 24.972 | 1.3444 | 18.58 | 0 |
| TMAX | 0.678 | 0.0271 | 25.03 | 0 |
| month02 | -5.977 | 1.1604 | -5.15 | 0 |
| month03 | -19.398 | 1.1672 | -16.62 | 0 |
| month04 | -33.029 | 1.2871 | -25.66 | 0 |
| month05 | -37.962 | 1.4304 | -26.54 | 0 |
| month06 | -38.809 | 1.5925 | -24.37 | 0 |
| month07 | -37.794 | 1.6857 | -22.42 | 0 |
| month08 | -40.456 | 1.6563 | -24.43 | 0 |
| month09 | -43.796 | 1.5422 | -28.40 | 0 |
| month10 | -34.841 | 1.3458 | -25.89 | 0 |
| month11 | -19.997 | 1.2209 | -16.38 | 0 |
| month12 | -7.933 | 1.1470 | -6.92 | 0 |
The regression coefficient for TMAX is significant and positively
correlated, with each degree increase resulting in AQI increasing by a
factor of 0.68. All months, when compared to January, have a negative
impact on AQI, with September having the largest difference. The p-value
of the model’s F-statistic is also significant, allowing us to reject
the null hypothesis and conclude that there’s a significant relationship
between our chosen predictors and the daily AQI value. However, the
\(R^2\) for our model is only
.149 (adjusted .147), which indicates that only about 15% of the variation in
daily AQI can be explained by TMAX and month.
Seasonality can cause a poor linear model. Properly testing it would require developing a seasonality time-series model to properly fit the data.
Check for multicollinearity
# model VIF scores (variance inflation factors, to check multicollinearity
# among the regressors of the second model)
xkablevif(aqi_fit2, title = "Model 2 VIF")
| month02 | month03 | month04 | month05 | month06 | month07 | month08 | month09 | month10 | month11 | month12 | TMAX |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1.78 | 1.98 | 2.32 | 2.88 | 3.36 | 3.79 | 3.59 | 3.01 | 2.38 | 1.96 | 1.84 | 4.25 |
The VIF values of all regressors are acceptable.
A k-NN model can help us further analyze the seasonality effect by
attempting to predict the month based on AQI and
TMAX variables.
Evaluate relationships via scatter plots, after scaling and centering the data:
- scatter plots of AQI vs. TMAX
- check the composition of the labels (months)
Plot variables
# TMAX vs. AQI colored by month, to eyeball how separable the twelve
# month classes are before fitting the kNN classifier.
ggplot(DailyAQ_merged) +
geom_point(aes(x=AQI, y=TMAX, color=month), alpha = 0.7) +
labs(title = "Daily Maximum Temperature vs Daily Air Quality Index Value Distinguished By Month",
x = "Daily AQI Value",
y = "Daily Maximum Temperature (F)") +
scale_color_brewer(palette = "Paired")
Center and scale our data
# Standardize (center and scale) the six numeric feature columns
# (PM2.5, AQI, TMAX, TMIN, PRCP, SNOW) so kNN distances are comparable
# across units. center/scale default to TRUE in scale().
DailyAQ_scaled <- as.data.frame(scale(DailyAQ_merged[, 4:9]))
str(DailyAQ_scaled)
## 'data.frame': 7063 obs. of 6 variables:
## $ PM2.5: num 3.839 4.039 2.642 0.727 -0.417 ...
## $ AQI : num 3.282 3.421 2.264 0.876 -0.373 ...
## $ TMAX : num -0.6996 -0.1462 0.0752 -0.1462 -0.8656 ...
## $ TMIN : num -0.872 -0.328 0.155 -0.147 -1.173 ...
## $ PRCP : num -0.356 -0.356 -0.356 1.502 -0.356 ...
## $ SNOW : num -0.113 -0.113 -0.113 -0.113 -0.113 ...
Create train and test data sets with 4:1 splits, as well as label sets.
# Reproducible 80/20 train/test assignment: draw a 1 or 2 for every row.
set.seed(1000)
DailyAQ_sample <- sample(2, nrow(DailyAQ_scaled), replace=TRUE, prob=c(0.80, 0.20))
in_train <- DailyAQ_sample == 1
# Features come from the scaled frame; labels (column 3 = month) come
# from the unscaled merged frame, using the same row assignment.
DailyAQ_training <- DailyAQ_scaled[in_train, ]
DailyAQ_test <- DailyAQ_scaled[!in_train, ]
DailyAQ_trainLabels <- DailyAQ_merged[in_train, 3]
DailyAQ_testLabels <- DailyAQ_merged[!in_train, 3]
# Size of the training split.
nrow(DailyAQ_training)
## [1] 5664
nrow(DailyAQ_test)
## [1] 1399
Build kNN model
# Number of neighbors for the initial kNN fit. (`kval` is reused below
# when rendering the per-class metrics table.)
kval <- 3
# Predict the month of each test observation from its k nearest
# neighbors in the scaled training data.
DailyAQ_pred <- knn(train = DailyAQ_training,
                    test  = DailyAQ_test,
                    cl    = DailyAQ_trainLabels,
                    k     = kval)
# Cross-tabulate predictions against the true months and keep the
# overall accuracy from caret's confusion-matrix object.
DailyAQ_confusionMatrix <- caret::confusionMatrix(DailyAQ_pred,
                                                  reference = DailyAQ_testLabels)
DailyAQ_pred_accuracy <- DailyAQ_confusionMatrix$overall['Accuracy']
xkabledply(as.matrix(DailyAQ_confusionMatrix), title = paste("ConfusionMatrix for k = ", kval))
| 01 | 02 | 03 | 04 | 05 | 06 | 07 | 08 | 09 | 10 | 11 | 12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 01 | 60 | 38 | 16 | 7 | 1 | 0 | 0 | 0 | 0 | 5 | 13 | 50 |
| 02 | 24 | 33 | 38 | 10 | 0 | 0 | 0 | 0 | 0 | 4 | 18 | 18 |
| 03 | 14 | 13 | 37 | 26 | 7 | 1 | 0 | 0 | 2 | 11 | 27 | 11 |
| 04 | 2 | 3 | 20 | 52 | 22 | 6 | 2 | 2 | 3 | 36 | 25 | 6 |
| 05 | 0 | 1 | 2 | 20 | 38 | 19 | 10 | 8 | 28 | 27 | 7 | 0 |
| 06 | 0 | 0 | 0 | 1 | 8 | 39 | 34 | 25 | 25 | 7 | 1 | 0 |
| 07 | 0 | 0 | 0 | 0 | 4 | 15 | 41 | 34 | 13 | 3 | 0 | 0 |
| 08 | 0 | 0 | 0 | 2 | 2 | 15 | 23 | 24 | 13 | 3 | 0 | 0 |
| 09 | 0 | 0 | 1 | 1 | 9 | 14 | 9 | 10 | 17 | 4 | 1 | 0 |
| 10 | 0 | 3 | 2 | 6 | 8 | 5 | 1 | 0 | 3 | 24 | 8 | 1 |
| 11 | 2 | 1 | 9 | 6 | 4 | 1 | 0 | 0 | 0 | 6 | 15 | 6 |
| 12 | 16 | 13 | 9 | 3 | 0 | 0 | 0 | 0 | 0 | 2 | 5 | 19 |
xkabledply(data.frame(DailyAQ_confusionMatrix$byClass), title=paste("k = ", kval))
| Sensitivity | Specificity | Pos.Pred.Value | Neg.Pred.Value | Precision | Recall | F1 | Prevalence | Detection.Rate | Detection.Prevalence | Balanced.Accuracy | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Class: 01 | 0.508 | 0.898 | 0.316 | 0.952 | 0.316 | 0.508 | 0.390 | 0.0843 | 0.0429 | 0.1358 | 0.704 |
| Class: 02 | 0.314 | 0.913 | 0.228 | 0.943 | 0.228 | 0.314 | 0.264 | 0.0751 | 0.0236 | 0.1036 | 0.614 |
| Class: 03 | 0.276 | 0.911 | 0.248 | 0.922 | 0.248 | 0.276 | 0.262 | 0.0958 | 0.0264 | 0.1065 | 0.594 |
| Class: 04 | 0.388 | 0.900 | 0.290 | 0.933 | 0.290 | 0.388 | 0.332 | 0.0958 | 0.0372 | 0.1279 | 0.644 |
| Class: 05 | 0.369 | 0.906 | 0.237 | 0.948 | 0.237 | 0.369 | 0.289 | 0.0736 | 0.0272 | 0.1144 | 0.637 |
| Class: 06 | 0.339 | 0.921 | 0.279 | 0.940 | 0.279 | 0.339 | 0.306 | 0.0822 | 0.0279 | 0.1001 | 0.630 |
| Class: 07 | 0.342 | 0.946 | 0.373 | 0.939 | 0.373 | 0.342 | 0.356 | 0.0858 | 0.0293 | 0.0786 | 0.644 |
| Class: 08 | 0.233 | 0.955 | 0.293 | 0.940 | 0.293 | 0.233 | 0.260 | 0.0736 | 0.0172 | 0.0586 | 0.594 |
| Class: 09 | 0.164 | 0.962 | 0.258 | 0.935 | 0.258 | 0.164 | 0.200 | 0.0743 | 0.0122 | 0.0472 | 0.563 |
| Class: 10 | 0.182 | 0.971 | 0.393 | 0.919 | 0.393 | 0.182 | 0.249 | 0.0944 | 0.0172 | 0.0436 | 0.576 |
| Class: 11 | 0.125 | 0.973 | 0.300 | 0.922 | 0.300 | 0.125 | 0.176 | 0.0858 | 0.0107 | 0.0357 | 0.549 |
| Class: 12 | 0.171 | 0.963 | 0.284 | 0.931 | 0.284 | 0.171 | 0.213 | 0.0793 | 0.0136 | 0.0479 | 0.567 |
How does k affect classification accuracy?
# Fit a kNN classifier with `k` neighbors on the training set, classify
# the validation set, and return c(k, accuracy) for that k.
evaluateK <- function(k, train_set, val_set, train_class, val_class) {
  # Reseed so every k is evaluated under the same tie-breaking randomness.
  set.seed(1000)
  class_knn <- knn(train = train_set,
                   test  = val_set,
                   cl    = train_class,
                   k     = k)
  # Accuracy = share of diagonal (correctly classified) cells.
  tab <- table(class_knn, val_class)
  accu <- sum(diag(tab)) / sum(tab)
  cbind(k = k, accuracy = accu)
}
# Evaluate every odd k from 1 to 21 and collect (k, accuracy) pairs.
k_grid <- seq(1, 21, by = 2)
knn_different_k <- sapply(k_grid,
                          function(k) evaluateK(k,
                                                train_set = DailyAQ_training,
                                                val_set = DailyAQ_test,
                                                train_class = DailyAQ_trainLabels,
                                                val_class = DailyAQ_testLabels))
# sapply returns a 2 x length(k_grid) matrix; reshape into a data frame
# with one row per k.
knn_different_k <- data.frame(k = knn_different_k[1, ],
                              accuracy = knn_different_k[2, ])
# Accuracy as a function of k, to pick the elbow of the curve.
ggplot(knn_different_k, aes(x = k, y = accuracy)) +
geom_line(color = "orange", size = 1.5) +
geom_point(size = 3) +
labs(title = "kNN Model Accuracy vs k-value",
x = "Model k-value",
y = "Accuracy")
It seems 13-nearest neighbors is a decent choice because that’s the
greatest improvement in predictive accuracy before the incremental
improvement trails off. With an accuracy of 0.32, our model predicting
month based on TMAX and AQI is not a strong
fit.